In [174]:
%run dataFormating.ipynb


0.0 RedMetrics data preparation
1. Game sessions
2. Google form analysis
temporalities set (user answer method)
profile info set
2.1 Sampling
3. Per session and per user analysis
4. User comparison

In [175]:
import sklearn
print (sklearn.__version__)


0.19.1

In [176]:
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LassoCV, Lasso
from sklearn.linear_model import RidgeCV, Ridge

from ipywidgets import FloatProgress
from IPython.display import display

from math import *

from scipy import stats
from scipy.stats.mstats import normaltest

from matplotlib.pyplot import boxplot

Questionnaire only

Can the answers to the scientific questions be used to predict if the questionnaire was filled before or after the game?

Note: I am using only decision tree methods here because other methods like naive bayes do not make sense on categorical data

If scientific questions are coded by answers


In [177]:
# Select columns that correspond to scientific questions
scientificColumns = [x for x in list(defForms.columns.values) if x[0] == "Q"]

# Pick features and target
features = defForms.loc[:, scientificColumns]
target = defForms["temporality"].astype('int')

In [178]:
# Classify using decision trees -accounts for the small size of the dataset and the categorical nature of the features
clf = DecisionTreeClassifier(max_depth=None, min_samples_split=2, random_state=0, max_features="auto")
scores = cross_val_score(clf, features, target)
scores.mean()


Out[178]:
0.871072796934866

In [179]:
# Classify using random forests -accounts for the small size of the dataset and the categorical nature of the features, limit overfitting
clf = RandomForestClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0, bootstrap=True)
scores = cross_val_score(clf, features, target)
scores.mean()


Out[179]:
0.8990421455938696

In [180]:
# Classify using extra tree classifiers, more random than random forest methods
clf = ExtraTreesClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0, bootstrap=True)
scores = cross_val_score(clf, features, target)
scores.mean()


Out[180]:
0.9045977011494252

Conclusion: Accuracy is around 85%. Not bad but we expected better (17/01/2018)

If scientific questions are coded by correctedness


In [181]:
# Select columns that correspond to scientific questions
scientificColumns = [x for x in list(defCorrectedForms.columns.values) if x[0] == "Q"]

# Pick features and target
features = defCorrectedForms.loc[:, scientificColumns]
target = defCorrectedForms["temporality"].astype('int')

In [182]:
# Classify using decision trees -accounts for the small size of the dataset and the categorical nature of the features
clf = DecisionTreeClassifier(max_depth=None, min_samples_split=2, random_state=0, max_features="auto")
scores = cross_val_score(clf, features, target)
scores.mean()


Out[182]:
0.960536398467433

In [183]:
# Classify using random forests -accounts for the small size of the dataset and the categorical nature of the features, limit overfitting
clf = RandomForestClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0, bootstrap=True)
scores = cross_val_score(clf, features, target)
scores.mean()


Out[183]:
0.9045977011494252

In [184]:
# Classify using extra tree classifiers, more random than random forest methods
clf = ExtraTreesClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0, bootstrap=True)
scores = cross_val_score(clf, features, target)
scores.mean()


Out[184]:
0.9383141762452109

Conclusion: Accuracy is around 80%. Not bad but we expected better (19/12/2017)

RedMetrics only

RedMetrics data


In [185]:
def getPosttestUserIds(gfdf):
    return gfdf[gfdf[QTemporality] == answerTemporalities[1]][QUserId].unique()

In [186]:
allDataWebgl1522 = prepareAllData(getAllUserVectorData(
    getPosttestUserIds(gfdfWebgl1522UniqueProfiles),
    rmdfWebgl1522UniqueProfiles,
    gfdfWebgl1522UniqueProfiles,
    _source = correctAnswers + demographicAnswers,
    _printDebug=False))



In [187]:
allDataWebgl1522Volunteers = prepareAllData(getAllUserVectorData(
    getAllResponders(gfdfWebgl1522PretestPosttestUniqueProfilesVolunteers),
    rmdfWebgl1522PretestPosttestUniqueProfilesVolunteers,
    gfdfWebgl1522PretestPosttestUniqueProfilesVolunteers,
    _source = correctAnswers + demographicAnswers,
    _printDebug=False))



In [188]:
allDataWebgl160 = prepareAllData(getAllUserVectorData(
    getPosttestUserIds(gfdfWebgl160UniqueProfiles),
    rmdfWebgl160UniqueProfiles,
    gfdfWebgl160UniqueProfiles,
    _source = correctAnswers + demographicAnswers,
    _printDebug=False))



In [189]:
allDataWebgl160Volunteers = prepareAllData(getAllUserVectorData(
    getAllResponders(gfdfWebgl160PretestPosttestUniqueProfilesVolunteers),
    rmdfWebgl160PretestPosttestUniqueProfilesVolunteers,
    gfdfWebgl160PretestPosttestUniqueProfilesVolunteers,
    _source = correctAnswers + demographicAnswers,
    _printDebug=False))



In [190]:
allDataPlaytestPhase2 = prepareAllData(getAllUserVectorData(
    getAllResponders(gfdfPlaytestPhase2PretestPosttestUniqueProfilesVolunteers),
    rmdfPlaytestPhase2PretestPosttestUniqueProfilesVolunteers,
    gfdfPlaytestPhase2PretestPosttestUniqueProfilesVolunteers,
    _source = correctAnswers + demographicAnswers,
    _printDebug=False))


Can the score of a player be predicted with their RedMetrics data?


In [191]:
def getAnonymousData(allDataClassif):
    return allDataClassif.drop("anonymousID", axis = 1)

In [192]:
# columns to exclude: contain direct information on posttest score
dropPosttestColumns = allDataClassif.columns & (deltaQuestions + posttestQuestions + ["scoreposttest", "scoredelta"])
dropPretestColumns = allDataClassif.columns & (pretestQuestions + ["scorepretest"])

In [193]:
def getUnscaledFeatures(anonymousData, dropPosttest=True, dropPretest=True):
    # Only select rows where scoreafter is not negative
    result = anonymousData[anonymousData["scoreposttest"] >= 0]
    if dropPosttest:
        result = result.drop(dropPosttestColumns, axis = 1)
    if dropPretest:
        result = result.drop(dropPretestColumns, axis = 1)
    return result

In [194]:
def getFeaturesTarget(allDataClassif, chosenModel = Lasso):
    # Remove id
    anonymousData = getAnonymousData(allDataClassif)

    # Get features and target
    # Only select rows where scoreafter is not negative
    unscaledFeatures = getUnscaledFeatures(anonymousData)
    target = anonymousData[anonymousData["scoreposttest"] >= 0]["scoreposttest"]

    # Center and scale data
    #features = preprocessing.scale(unscaledFeatures)

    # Center and scale data variant
    standardScaler = preprocessing.StandardScaler()
    standardScaler.fit(unscaledFeatures)
    features = standardScaler.transform(unscaledFeatures)
    
    # Run Lasso regression with cross-validation
    model = chosenModel()
    scores = cross_val_score(model, features, target, cv=10)
    boxplot(scores)
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))
    
    model.fit(features, target)    
    
    return scores, standardScaler, model, features, target, unscaledFeatures

In [195]:
scores, standardScaler, model, features, target, unscaledFeatures = getFeaturesTarget(allDataClassif)


Accuracy: 0.26 (+/- 0.35)

In [196]:
def getInvertedCriteria(allData, criteria):
    result = allData.copy()
    
    if not (len(result.columns & criteria) == len(criteria)):
        print("not all criteria are in input columns")
    
    for criterion in criteria:
        result[criterion] = 1 / (1 + result[criterion])
        
    return result

In [197]:
#allDataClassifInv = getAllDataClassif(getInvertedCriteria(allData, totalTimesCriteria + completionTimesCriteria))
allDataClassifInv = getAllDataClassif(getInvertedCriteria(allData, totalTimesCriteria + completionTimesCriteria))
scoresInv, standardScalerInv, modelInv, featuresInv, targetInv, unscaledFeaturesInv = getFeaturesTarget(allDataClassifInv)


Accuracy: 0.36 (+/- 0.31)

In [198]:
#list(set(allDataClassifInv.columns) - set(['anonymousID']))

In [199]:
#criteria = list(\
#    set(adc.columns)\
#    - set(adc.columns & \
#          (deltaQuestions + posttestQuestions
#           + pretestQuestions
#           + ["scoreposttest", "scoredelta", 'scoreundefined', "anonymousID"]
#           + ["scorepretest"]
#           + predefinedCriteria
#          ))\
#)
getScoresMean(allDataClassifInv, list(set(allDataClassifInv.columns) 
                                      - set(
                                          deltaQuestions
                                          + posttestQuestions
                                          + pretestQuestions
                                          + ["scorepretest", "scoreposttest", "scoredelta", 'scoreundefined', 'anonymousID']
                                      )))


Out[199]:
0.36179931762866546

Prediction of a single score


In [200]:
def getPrediction(standardScaler, model, unscaledX):
    X = standardScaler.transform([unscaledX])
    return model.predict(X)[0]

In [201]:
def getPredictionVsActual(standardScaler, model, allDataClassif):
    unscaledFeatures = getUnscaledFeatures(getAnonymousData(allDataClassif))
    
    result = pd.DataFrame(index = unscaledFeatures.index, columns=["predicted", "actual", "error"], data = -1)

    for userId in unscaledFeatures.index:
        unscaledX = unscaledFeatures.loc[userId].values
        actualScore = allDataClassif.loc[userId, "scoreposttest"]

        result.loc[userId, "predicted"] = getPrediction(standardScaler, model, unscaledX)
        result.loc[userId, "actual"] = actualScore
        result.loc[userId, "error"] = result.loc[userId, "predicted"] - result.loc[userId, "actual"]
        
    r2Coef = model.score(standardScaler.transform(unscaledFeatures), result["actual"].values)
        
    return result, r2Coef

use allData from online campaigns


In [202]:
samples = [allDataWebgl1522,
           allDataWebgl1522Volunteers,
           allDataWebgl160,
           allDataWebgl160Volunteers,
           allDataPlaytestPhase2
          ]

for sample in samples:
    _allDataClassif = getAllDataClassif(sample)
    result, r2Coef = getPredictionVsActual(standardScaler, model, _allDataClassif)
    print("{0:0=2d}".format(len(_allDataClassif)) + ":     " + str(r2Coef))
    
    _allDataClassifInv = getAllDataClassif(getInvertedCriteria(sample, totalTimesCriteria + completionTimesCriteria))
    resultInv, r2CoefInv = getPredictionVsActual(standardScalerInv, modelInv, _allDataClassifInv)
    print("{0:0=2d}".format(len(_allDataClassifInv)) + " inv: " + str(r2CoefInv))


15:     0.4886853012762777
15 inv: 0.44071056286415916
02:     0.5461626808214248
02 inv: 0.6872632557580105
29:     -0.43131922279814594
29 inv: -0.7094535923362839
01:     0.0
01 inv: 0.0
30:     0.052972596347304
30 inv: -0.06338921091904726

Determining the most important variables


In [203]:
def getLassoModelCoefficients(model, unscaledFeatures, useAbs = True):
    nonNullIndices = np.nonzero(model.coef_)    
    data = model.coef_[nonNullIndices]
    if useAbs:
        data = abs(data)    
    lassoModelParameters = pd.Series(
        index = unscaledFeatures.columns[nonNullIndices],
        data = data
    ).sort_values()
    return lassoModelParameters

In [204]:
getLassoModelCoefficients(model, unscaledFeatures)


Out[204]:
start             9.580351e-16
sessionsCount     5.687725e-02
ch06total         9.111911e-02
completionTime    1.942977e-01
maxChapter        2.689778e-01
ch04completion    6.087838e-01
ch05completion    9.165130e-01
ch07completion    1.781864e+00
dtype: float64

In [205]:
getLassoModelCoefficients(modelInv, unscaledFeatures)


Out[205]:
start             2.235415e-15
sessionsCount     1.596591e-02
ch06completion    1.669966e-01
ch02completion    8.394467e-01
ch07total         9.338385e-01
ch05total         2.005835e+00
dtype: float64

In [206]:
#unscaledFeatures = getUnscaledFeatures(getAnonymousData(allDataClassifWebgl160Volunteers))
#unscaledX = unscaledFeatures.iloc[0].values
#X = standardScaler.transform([unscaledX])
#model.predict(X)[0]

#X = (unscaledX - standardScaler.mean_) / standardScaler.scale_
#model.predict([X])[0]

#np.dot(model.coef_, X) + model.intercept_

Prediction of all scores


In [207]:
if False:
    anonymousData = getAnonymousData(allDataClassif)
    
    sortedUnscaledFeatures = anonymousData[anonymousData["scoreposttest"] >= 0].sort_values(by="scoreposttest").drop(dropPosttestColumns, axis = 1)
    sortedTarget = sorted(anonymousData[anonymousData["scoreposttest"] >= 0]["scoreposttest"])

    # Center and scale data variant
    sortedFeatures = standardScaler.transform(sortedUnscaledFeatures)

    x = range(len(sortedFeatures))
    alpha = 0.5

    fig, ax = plt.subplots()
    plt.title('Actual vs predicted score')
    plt.xlabel('User index')
    plt.ylabel('Score')
    #plt.plot(x, model.predict(sortedFeatures), kind = 'bar')
    #plt.plot(x, sortedTarget)
    ax.bar(x, model.predict(sortedFeatures), alpha=alpha, label='predicted', linewidth=0)
    ax.bar(x, sortedTarget,                  alpha=alpha, label='actual')
    ax.legend()
    fig.tight_layout()
    plt.show()

Conclusion: Score cannot be predicted by the table of RedMetrics data (19/07/2018).

Second degree polynomial


In [208]:
def getFeaturesTargetSecondDegreePolynomial(allDataClassif, chosenModel = Lasso):
    # Remove id
    anonymousData = getAnonymousData(allDataClassif)

    # Get features and target
    # Only select rows where scoreafter is not negative
    unscaledFeatures = getUnscaledFeatures(anonymousData)
    target = anonymousData[anonymousData["scoreposttest"] >= 0]["scoreposttest"]

    # Add polynomial features
    secondDegreeFeatures = preprocessing.PolynomialFeatures(degree=2, interaction_only=False, include_bias=True)
    unscaledFeatures = secondDegreeFeatures.fit_transform(unscaledFeatures)

    # Center and scale data variant
    standardScaler = preprocessing.StandardScaler()
    standardScaler.fit(unscaledFeatures)
    features = standardScaler.transform(unscaledFeatures)
    
    # Run Lasso regression with cross-validation
    model = chosenModel()
    scores = cross_val_score(model, features, target, cv=10)
    boxplot(scores)
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))
    
    model.fit(features, target)
    
    return scores, standardScaler, model, features, target, unscaledFeatures

In [209]:
scores1, standardScaler1, model1, features1, target1, unscaledFeatures1 = getFeaturesTargetSecondDegreePolynomial(allDataClassif);
scores2, standardScaler2, model2, features2, target2, unscaledFeatures2 = getFeaturesTargetSecondDegreePolynomial(allDataClassifInv);


Accuracy: 0.26 (+/- 0.38)
Accuracy: 0.34 (+/- 0.33)

Conclusion: Score cannot be predicted by the table of RedMetrics data + second degree polynomial (30/01/2018)

Let's try by reducing the number of features


In [210]:
# Remove id
anonymousData = getAnonymousData(allDataClassifInv)

# Get features and target
# Only select rows where scoreafter is not negative
unscaledFeatures = anonymousData[anonymousData["scoreposttest"] >= 0]

#unscaledFeatures = unscaledFeatures[["craft", "death", "add", "remove", "reach", "maxChapter"] + totalTimesCriteria + completionTimesCriteria]
#unscaledFeatures = unscaledFeatures[["craft", "death", "add", "remove", "reach", "maxChapter"]]
#unscaledFeatures = unscaledFeatures[totalTimesCriteria]
#unscaledFeatures = unscaledFeatures[completionTimesCriteria]
#unscaledFeatures = unscaledFeatures[["maxChapter", "ch05completion", "ch07completion", "ch07total", "ch09total"]]
#unscaledFeatures = unscaledFeatures[['pretest Enjoyed playing', 'scorepretest', 'pretest Want to learn more about Biology', 'ch05total', 'ch07total']]
#unscaledFeatures = unscaledFeatures[['ch05completion', 'ch08total', 'ch06total', 'scorepretest', 'pretest Want to learn more about Biology', 'ch05total', 'ch07total']]
if False:#'columnsForRegression' in globals():
    unscaledFeatures = unscaledFeatures[columnsForRegression]
else:
#    unscaledFeatures = unscaledFeatures[['ch05completion', 'ch08total', 'ch06total', 'scorepretest', 'pretest Want to learn more about Biology', 'ch05total', 'ch07total']]
#    unscaledFeatures = unscaledFeatures[['pretest Enjoyed playing', 'scorepretest', 'pretest Want to learn more about Biology', 'ch05total', 'ch07total']]
    unscaledFeatures = unscaledFeatures[['ch02completion', 'ch05completion', 'ch05total', 'ch07total', 'ch08total']]

target = anonymousData[anonymousData["scoreposttest"] >= 0]["scoreposttest"]

# Add polynomial features
secondDegreeFeatures = preprocessing.PolynomialFeatures(degree=2, interaction_only=False, include_bias=True)
features = secondDegreeFeatures.fit_transform(unscaledFeatures)

# Center and scale data
features = preprocessing.scale(unscaledFeatures)

# Run Lasso regression with cross-validation
model = Lasso()
scores = cross_val_score(model, features, target, cv=10)
boxplot(scores)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))

model.fit(features, target)

getLassoModelCoefficients(model, unscaledFeatures)


Accuracy: 0.39 (+/- 0.28)
Out[210]:
ch02completion    0.880111
ch07total         1.024656
ch05total         2.023169
dtype: float64

In [211]:
def getScoresMean(allDataClassif, columnsSubset):
    anonymousData = getAnonymousData(allDataClassif)
    unscaledFeatures = anonymousData[anonymousData["scoreposttest"] >= 0]
    unscaledFeatures = unscaledFeatures[columnsSubset]
    target = anonymousData[anonymousData["scoreposttest"] >= 0]["scoreposttest"]
    secondDegreeFeatures = preprocessing.PolynomialFeatures(degree=2, interaction_only=False, include_bias=True)
    features = secondDegreeFeatures.fit_transform(unscaledFeatures)
    features = preprocessing.scale(unscaledFeatures)
    model = Lasso()
    scores = cross_val_score(model, features, target, cv=10)
    return scores.mean()

In [212]:
# number of possibles subsets of size n of a set of size 96
import scipy.special
scipy.special.binom(96, 3),\
scipy.special.binom(96, 4),\
scipy.special.binom(96, 5),\
scipy.special.binom(96, 6),\
scipy.special.binom(96, 7),\
scipy.special.binom(96, 8)


Out[212]:
(142880.0, 3321960.0, 61124064.0, 927048304.0, 11919192480.0, 132601016340.0)

In [213]:
from IPython.display import HTML
HTML('''<script>
code_show_err=false; 
function code_toggle_err() {
 if (code_show_err){
 $('div.output_stderr').hide();
 } else {
 $('div.output_stderr').show();
 }
 code_show_err = !code_show_err
} 
$( document ).ready(code_toggle_err);
</script>
To toggle on/off output_stderr, click <a href="javascript:code_toggle_err()">here</a>.''')


Out[213]:
To toggle on/off output_stderr, click here.

In [214]:
def getETA(computations, timestamp):
    # computing speed: computations per second
    computationSpeed = 2794155 / 42338
    duration = computations / computationSpeed
    eta = timestamp + pd.Timedelta(seconds = duration)
    return eta

In [ ]:
import itertools
import time
import scipy.special
import warnings
from ipywidgets import Textarea, FloatText, ToggleButton, Checkbox
warnings.filterwarnings('ignore')


#adc = allDataClassif.copy()
adc = allDataClassifInv.copy()

# criteria with pretest info
#predefinedCriteria = ['ch05completion', 'scorepretest', 'pretest Want to learn more about Biology', 'ch07total', 'ch05total',]
# criteria with only RM info
#predefinedCriteria = ['ch02completion', 'ch05completion', 'ch05total', 'ch07total', 'ch08total',]
#predefinedCriteria = ['ch02completion', 'ch05completion', 'ch05total', 'ch07total', 'ch08total',]
predefinedCriteria = ['ch06completion', 'ch02completion', 'ch07total', 'ch05total', ]

criteria = list(\
    set(adc.columns)\
    - set(adc.columns & \
          (deltaQuestions + posttestQuestions
           + pretestQuestions
           + ["scoreposttest", "scoredelta", 'scoreundefined', "anonymousID"]
           + ["scorepretest"]
           + predefinedCriteria
          ))\
)

subsetSize = 4
combinations = scipy.special.binom(len(criteria), subsetSize)
print("#combinations="+str(combinations))
print("ETA " + str(getETA(combinations, pd.Timestamp.now())))

In [ ]:
if True:
    
     # very long computation time: > 10h
    maxScore = 0.36
    i = 0
    columnsForRegression = []
    iterations = combinations+2

    _progress = IntProgress(min=0, max=iterations)
    _intText = IntText(0)
    _currentBest = FloatText(0.0)
    _currentCriteria = Textarea("")
    #_stopButton = ToggleButton(value=False, description='Stop')
    #_stopCheckbox = Checkbox(value=False, description='Stop')

    display(_progress)
    display(_intText)
    display(_currentBest)
    display(_currentCriteria)
    #display(_stopButton)
    #display(_stopCheckbox)

    iterator = itertools.combinations(criteria, subsetSize)

    start_time = time.time()
    for columnsSubset in iterator:
        #if _stopButton.value or _stopCheckbox.value or (i >= iterations):
        if (i >= iterations):
            break
        else:
            i += 1
            _progress.value += 1
            _intText.value+= 1
            score = getScoresMean(adc, list(columnsSubset) + predefinedCriteria)
            if score > maxScore:
                maxScore = score
                _currentBest.value = score
                columnsForRegression = list(columnsSubset) + predefinedCriteria
                _currentCriteria.value = str(columnsForRegression)

    print("--- executed %s / %s in %s seconds ---" % (i, combinations, time.time() - start_time))
    print("--- end time: " + str(pd.Timestamp.now()))

    maxScore, columnsForRegression

In [217]:
# how long to compute all
(17 * 61124064 / 1000) / 3600,\
(249 * 57940519 / 15000) / 3600,\
(204 * 57940519 / 15000) / 3600,\


Out[217]:
(288.64141333333333, 267.17017094444446, 218.88640511111112)

In [218]:
# how much computed in some duration
durationSeconds = 5 * 60
durationSeconds * 1000 / 17


Out[218]:
17647.058823529413

allDataClassif

0.389994800369642, ['ch12completion', 'totalTime', 'pretest Studied biology']

0.3899953439583282, ['scoreundefined', 'pretest Want to learn more about Biology', 'ch12total', 'ch10total', 'pretest Name: PR')]

allDataClassifInv

0.39870229095205095, ['pretest Want to learn more about Biology', 'ch01completion', 'reach', 'pretest Example: CDS', 'pretest Played Hero.Coli']

['pretest Want to learn more about Biology', 'ch01completion', 'pretest Device: PCONS:RBS:FLHDC:TER', 'ch05total', 'scorepretest']

Conclusion: Tried different combinations, but cannot find any interesting regression (02/02/2018)

Questionnaire and RedMetrics

Can the biology level of a player be predicted using the game data?


In [219]:
# Remove id
anonymousData = gameAndCorrectedAfterDataClassif.drop("anonymousID", axis = 1)

# Get features and target
# Only select rows where scoreafter is not negative
features = anonymousData[anonymousData["scoreposttest"] >= 0]
features = features.loc[:,"sessionsCount":"completionTime"]
target = anonymousData[anonymousData["scoreposttest"] >= 0]["biologyStudy"]

# Add polynomial features
secondDegreeFeatures = preprocessing.PolynomialFeatures(degree=2, interaction_only=False, include_bias=True)
features = secondDegreeFeatures.fit_transform(features)

# Center and scale data
features = preprocessing.scale(features)

In [220]:
# Run Lasso regression with cross-validation
model = Lasso()
scores = cross_val_score(model, features, target, cv=10)
boxplot(scores)
scores


Out[220]:
array([-0.01252511, -0.66992188, -0.09018179, -0.51773437, -0.4134375 ,
       -0.03507031, -0.0528125 , -0.09191176, -0.56165698, -0.02340812])

Conclusion: No (30/01/2018)

Can the gaming profile of a player be predicted using the game data?


In [221]:
# Remove id
anonymousData = gameAndCorrectedAfterDataClassif.drop("anonymousID", axis = 1)

# Get features and target
# Only select rows where scoreafter is not negative
features = anonymousData.loc[:,"sessionsCount":"completionTime"]
target = sum(anonymousData["gameInterest"], anonymousData["gameFrequency"])

# Add polynomial features
secondDegreeFeatures = preprocessing.PolynomialFeatures(degree=2, interaction_only=False, include_bias=True)
features = secondDegreeFeatures.fit_transform(features)

# Center and scale data
features = preprocessing.scale(features)

In [222]:
# Run Lasso regression with cross-validation
model = Lasso()
scores = cross_val_score(model, features, target, cv=10)
boxplot(scores)
scores


Out[222]:
array([-0.03111364, -1.4535125 , -0.5184398 , -0.09571289, -0.23198411,
       -0.00591202, -0.00249498, -0.15316201, -0.02673828, -0.18439203])

Conclusion: No (30/01/2018)

Can the completion time of each chapter be used to predict if a player is going to answer a specific scientific question correctly?


In [254]:
# Given a question tag, plot scores of cross-validated model
def tryClassification(data, scientificQuestion):
    # Remove id
    anonymousData = data.drop("anonymousID", axis = 1)

    # Get features and target
    # Only select rows where scoreafter is not negative
    features = anonymousData[anonymousData["scoreposttest"] >= 0]
    #features = features.iloc[:,24:37]
    features = features.loc[:,criteria]
    target = anonymousData[anonymousData["scoreposttest"] >= 0].loc[:,scientificQuestion].astype('int')

    # Add polynomial features
    secondDegreeFeatures = preprocessing.PolynomialFeatures(degree=2, interaction_only=False, include_bias=True)
    features = secondDegreeFeatures.fit_transform(features)

    # Center and scale data
    features = preprocessing.scale(features)
    
    # Classify using extra tree classifiers, more random than random forest methods
    clf = ExtraTreesClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0, bootstrap=True)
    scores = cross_val_score(clf, features, target, cv=5)
    
    # Display plot
    fig, ax = plt.subplots()
    boxplot(scores)
    
    return [scores.mean(), scores.std()]

In [224]:
scientificQuestionsDescrs = correctAnswers[correctAnswers.apply(len) != 0].index.values.tolist()
#scientificQuestionsDescrs

In [225]:
anonymousData.columns[24:37]


Out[225]:
Index(['ch01completion', 'ch02completion', 'ch03completion', 'ch04completion',
       'ch05completion', 'ch06completion', 'ch07completion', 'ch08completion',
       'ch09completion', 'ch10completion', 'ch11completion', 'ch12completion',
       'ch13completion'],
      dtype='object')

In [226]:
set(criteria) in set(anonymousData.columns)#[24:37]


Out[226]:
False

In [227]:
#[c for c in criteria if c not in anonymousData.columns]
#[c for c in anonymousData.columns if c not in criteria]
#anonymousData[criteria]

In [255]:
allScores = pd.DataFrame(index = ["Mean", "Var"])
for question in scientificQuestions:# ["QGenotypePhenotype", "QBioBricksDevicesComposition", "QAmpicillin", "QBBNamePlasmid", "QBBFunctionTER", "QBBNamePromoter", "QBBFunctionGameCDS", "QBBNameTerminator", "QBBFunctionBiologyCDS", "QBBNameRBS", "QBBExampleCDS", "QBBNameCDS", "QBBFunctionPR", "QBBFunctionRBS", "QBBFunctionPlasmid", "QBBNameOperator", "QDeviceRbsPconsFlhdcTer", "QDevicePconsRbsFlhdcTer", "QDevicePbadRbsGfpTer", "QDevicePbadGfpRbsTer", "QDeviceGfpRbsPconsTer", "QDevicePconsGfpRbsTer", "QDeviceAmprRbsPconsTer", "QDeviceRbsPconsAmprTer", "QGreenFluorescence", "QUnequipDevice", "QDevicePbadRbsAraTer"]:
    questionTag = question
    scores = tryClassification(gameAndCorrectedAfterDataClassif, questionTag)
    allScores[questionTag] = scores
allScores.columns = scientificQuestionsDescrs
allScores.T


Out[255]:
Mean Var
Genotype and phenotype 0.740832 0.049212
BioBricks and devices composition 0.607190 0.097480
Ampicillin antibiotic 0.637702 0.128454
Name: Plasmid 0.460784 0.022068
Function: TER 0.639628 0.030997
Name: PR 0.697626 0.050671
Function - game: CDS 0.799381 0.068934
Name: TER 0.571999 0.062974
Function - biology: CDS 0.730719 0.039030
Name: RBS 0.845201 0.045501
Example: CDS 0.663158 0.027511
Name: CDS 0.709150 0.070485
Function: PR 0.786275 0.024068
Function: RBS 0.650327 0.107357
Function: Plasmid 0.685621 0.095956
Name: Operator XXX 0.955556 0.022222
Device: RBS:PCONS:FLHDC:TER XXX 0.759993 0.106094
Device: PCONS:RBS:FLHDC:TER 0.675163 0.078557
Device: PBAD:RBS:GFP:TER 0.776711 0.053103
Device: PBAD:GFP:RBS:TER XXX 0.629412 0.054863
Device: GFP:RBS:PCONS:TER XXX 0.696457 0.091441
Device: PCONS:GFP:RBS:TER XXX 0.684967 0.047022
Device: AMPR:RBS:PCONS:TER XXX 0.729412 0.092117
Device: RBS:PCONS:AMPR:TER XXX 0.810458 0.074205
Green fluorescence 0.685208 0.026824
Unequip the movement device: effect 0.630065 0.071699
Device: PBAD:RBS:ARA:TER 0.876471 0.041306

Conclusion: Redmetrics can be used to predict answers to certain scientific questions (29/05/2018) TODO Raphael: Check which questions you want additional analysis for


In [256]:
#from scipy import stats
stats.describe(allScores.loc['Mean',:])


Out[256]:
DescribeResult(nobs=27, minmax=(0.46078431372549017, 0.9555555555555555), mean=0.708720967269299, variance=0.010106830153643127, skewness=0.1611369349868788, kurtosis=0.7030018068448562)

Can the game data be used to predict the performance on a sub-group of scientific questions?


In [230]:
def getBoxplot(scores, title = ''):
    # figure related code
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.boxplot(scores)
    ax.set_title(title)

In [231]:
#pd.concat([anonymousData.loc[:,"sessionsCount":"completionTime"], anonymousData.loc[:,"gameInterest":"previousPlay"]], axis=1).columns

In [232]:
#anonymousData.columns.values

In [233]:
ingameCriteria = ['sessionsCount', 'scoreposttest', 'scoreundefined', 'complete',
       'configure', 'craft', 'death', 'equip', 'unequip', 'add', 'remove',
       'gotourl', 'pickup', 'reach', 'restart', 'selectmenu', 'start',
       'scoredelta', 'maxChapter', 'efficiency', 'thoroughness', 'fun',
       'completionTime', 'ch00completion', 'ch01completion',
       'ch02completion', 'ch03completion', 'ch04completion',
       'ch05completion', 'ch06completion', 'ch07completion',
       'ch08completion', 'ch09completion', 'ch10completion',
       'ch11completion', 'ch12completion', 'ch13completion',
       'ch14completion', 'ch00total', 'ch01total', 'ch02total',
       'ch03total', 'ch04total', 'ch05total', 'ch06total', 'ch07total',
       'ch08total', 'ch09total', 'ch10total', 'ch11total', 'ch12total',
       'ch13total', 'ch14total', 'totalTime']

In [234]:
# boxplot function
#  questions: array of strings of question names
def getPerformanceFromQuestionGroup(questions,
                                    thresholdPercentage = 1.0,
                                    extraTreesClassifier = False,
                                    randomForestClassifier = False,
                                    lasso = False,
                                    histTarget = 0
                                   ):
    # Remove id
    anonymousData = gameAndCorrectedAfterDataClassif.drop("anonymousID", axis = 1)

    # Get features and target
    #features = pd.concat([anonymousData.loc[:,"sessionsCount":"completionTime"], anonymousData.loc[:,"gameInterest":"previousPlay"]], axis=1)
    features = anonymousData.loc[:,ingameCriteria]
    
    digitalTarget = anonymousData.loc[:, questions].astype(int).sum(axis=1)
    categoricalTarget = digitalTarget.apply(lambda x: 0 if x < thresholdPercentage*len(questions) else 1)

    # Add polynomial features
    secondDegreeFeatures = preprocessing.PolynomialFeatures(degree=2, interaction_only=False, include_bias=True)
    features = secondDegreeFeatures.fit_transform(features)

    # Center and scale data
    features = preprocessing.scale(features)

    if extraTreesClassifier:
        # Classify using extra tree classifiers, more random than random forest methods
        clf = ExtraTreesClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0, bootstrap=True)
        scores = cross_val_score(clf, features, categoricalTarget, cv=10)
        print("ExtraTreesClassifier scores mean: " + str(scores.mean()))

        # Display plot
        getBoxplot(scores, "ExtraTreesClassifier boxplot")
        
    if randomForestClassifier:
        # Classify using random forests -accounts for the small size of the dataset and the categorical nature of the features, limit overfitting
        clf = RandomForestClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0, bootstrap=True)
        scores = cross_val_score(clf, features, categoricalTarget)
        print("RandomForestClassifier scores mean: " + str(scores.mean()))

        # Display plot
        getBoxplot(scores, "RandomForestClassifier boxplot")
        
    if lasso:
        # Run Lasso regression with cross-validation
        model = Lasso()
        scores = cross_val_score(model, features, digitalTarget, cv=10)
        print("Lasso scores mean: " + str(scores.mean()))

        # Display plot
        getBoxplot(scores, "Lasso boxplot")
        
    if histTarget > 0:
        fig = plt.figure()
        ax = fig.add_subplot(111)
        ax.hist(target, bins = range(histTarget))

Using an arbitrary classification of questions

Hard questions


In [235]:
hardQuestions = ["QBBFunctionPR", "QBBNameOperator", "QDevicePbadRbsAraTer"]
getPerformanceFromQuestionGroup(hardQuestions, thresholdPercentage = 0.5, extraTreesClassifier = True, randomForestClassifier = True, lasso = True)


ExtraTreesClassifier scores mean: 0.9133333333333333
RandomForestClassifier scores mean: 0.8887652947719689
Lasso scores mean: -0.16786699561403515

Conclusion: Very high quality prediction (29/05/18)

Biobrick symbol recognition


In [236]:
bbSymbolRecognition = ["QBBNamePlasmid", "QBBFunctionTER", "QBBNamePromoter", "QBBFunctionGameCDS", "QBBNameTerminator", "QBBFunctionBiologyCDS", "QBBNameRBS", "QBBExampleCDS", "QBBNameCDS", "QBBFunctionPR", "QBBFunctionRBS", "QBBFunctionPlasmid", "QBBNameOperator"]
getPerformanceFromQuestionGroup(bbSymbolRecognition, thresholdPercentage = 0.6, extraTreesClassifier = True, randomForestClassifier = True, lasso = True)


ExtraTreesClassifier scores mean: 0.8463888888888889
RandomForestClassifier scores mean: 0.8651340996168582
Lasso scores mean: 0.6074146291292105

Conclusion: No apparent possible prediction (1/02/2018)

Easy questions


In [237]:
easyQuestions = ["QBioBricksDevicesComposition", "QDeviceRbsPconsFlhdcTer", "QGreenFluorescence"]
getPerformanceFromQuestionGroup(easyQuestions, thresholdPercentage = 1.0, extraTreesClassifier = True, randomForestClassifier = True, lasso = True, histTarget = 14)


ExtraTreesClassifier scores mean: 0.7913888888888889
RandomForestClassifier scores mean: 0.8091954022988506
Lasso scores mean: -0.12254100404919546

Conclusion: Inconclusive (01/02/2018)

Using Bloom's taxonomy

Not interpreted yet.

knowledge questions


In [238]:
knowledgeQuestions = ["QAmpicillin",
                      "QBBNamePlasmid",
                      "QBBNamePromoter",                      
                      "QBBNameTerminator",
                      "QBBNameRBS",
                      "QBBNameCDS",
                      "QBBNameOperator",
                     ]
getPerformanceFromQuestionGroup(knowledgeQuestions, thresholdPercentage = 0.7, extraTreesClassifier = True, randomForestClassifier = True, lasso = True, histTarget = 14)


ExtraTreesClassifier scores mean: 0.9488888888888889
RandomForestClassifier scores mean: 0.9551724137931035
Lasso scores mean: 0.013320744375209104

comprehension questions


In [239]:
comprehensionQuestions = ["QBioBricksDevicesComposition",
                      "QBBFunctionTER",
                      "QBBFunctionPlasmid",                      
                      "QUnequipDevice",
                     ]
getPerformanceFromQuestionGroup(comprehensionQuestions, thresholdPercentage = 1.0, extraTreesClassifier = True, randomForestClassifier = True, lasso = True, histTarget = 14)


ExtraTreesClassifier scores mean: 0.7686111111111111
RandomForestClassifier scores mean: 0.7513409961685823
Lasso scores mean: -0.0128011471284121

application questions


In [240]:
applicationQuestions = ["QGenotypePhenotype",
                      "QBBExampleCDS",
                      "QGreenFluorescence",
                     ]
getPerformanceFromQuestionGroup(applicationQuestions, thresholdPercentage = 1.0, extraTreesClassifier = True, randomForestClassifier = True, lasso = True, histTarget = 14)


ExtraTreesClassifier scores mean: 0.7130555555555556
RandomForestClassifier scores mean: 0.7530589543937708
Lasso scores mean: -0.2942080226861046

analysis questions


In [241]:
analysisQuestions = ["QBBFunctionGameCDS",
                      "QBBFunctionBiologyCDS",
                      "QBBFunctionPR",
                      "QBBFunctionRBS",
                      "QDevicePbadRbsAraTer",
                     ]
getPerformanceFromQuestionGroup(analysisQuestions, thresholdPercentage = 0.7, extraTreesClassifier = True, randomForestClassifier = True, lasso = True, histTarget = 14)


ExtraTreesClassifier scores mean: 0.9141666666666666
RandomForestClassifier scores mean: 0.921455938697318
Lasso scores mean: -0.18327559560994056

synthesis questions


In [242]:
synthesisQuestions = ["QDeviceRbsPconsFlhdcTer",
                      "QDevicePconsRbsFlhdcTer",
                      "QDevicePbadRbsGfpTer",                      
                      "QDevicePbadGfpRbsTer",
                      "QDeviceGfpRbsPconsTer",
                      "QDevicePconsGfpRbsTer",
                      "QDeviceAmprRbsPconsTer",
                      "QDeviceRbsPconsAmprTer",
                     ]
getPerformanceFromQuestionGroup(synthesisQuestions, thresholdPercentage = 1.0, extraTreesClassifier = True, randomForestClassifier = True, lasso = True, histTarget = 14)


ExtraTreesClassifier scores mean: 0.8772222222222222
RandomForestClassifier scores mean: 0.8766283524904215
Lasso scores mean: 0.636794245829142

Can the completion time be predicted from questionnaire answers?

From the before questionnaire


In [243]:
# Remove id
anonymousData = gameAndCorrectedBeforeDataClassif.drop("anonymousID", axis = 1)

# Get features and target
lastColumn = 'gender_Male'
for potentialLastColumn in ['gender_Other', 'gender_Prefer not to say']:
    if potentialLastColumn in anonymousData.columns:
        lastColumn = potentialLastColumn
features = anonymousData.loc[:,"gameInterest":lastColumn]
target = anonymousData.loc[:,"completionTime"]

# Add polynomial features
secondDegreeFeatures = preprocessing.PolynomialFeatures(degree=2, interaction_only=False, include_bias=True)
features = secondDegreeFeatures.fit_transform(features)

# Center and scale data
features = preprocessing.scale(features)

In [244]:
# Run Lasso regression with cross-validation
model = Lasso(max_iter=10000, alpha=10)
scores = cross_val_score(model, features, target, cv=10)
boxplot(scores)
scores.mean()


Out[244]:
-0.218501543355942

In [245]:
# Try classification
target = target.apply(lambda x: 0 if x < 7200 else 1) #0 if short, 1 if long

# Classify using extra tree classifiers, more random than random forest methods
clf = ExtraTreesClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0, bootstrap=True)
scores = cross_val_score(clf, features, target, cv=10)
    
# Display plot
boxplot(scores)
scores.mean()
sum(target)/len(target)


Out[245]:
0.6853932584269663

Conclusion: No (01/02/2018)

From the after questionnaire


In [246]:
# Remove id
anonymousData = gameAndCorrectedAfterDataClassif.drop("anonymousID", axis = 1)

# Get features and target
lastColumn = 'gender_Male'
for potentialLastColumn in ['gender_Other', 'gender_Prefer not to say']:
    if potentialLastColumn in anonymousData.columns:
        lastColumn = potentialLastColumn
features = anonymousData.loc[:,"gameInterest":lastColumn]
target = anonymousData.loc[:,"completionTime"]

# Add polynomial features
secondDegreeFeatures = preprocessing.PolynomialFeatures(degree=2, interaction_only=False, include_bias=True)
features = secondDegreeFeatures.fit_transform(features)

# Center and scale data
features = preprocessing.scale(features)

In [247]:
# Run Lasso regression with cross-validation
model = Lasso(max_iter=1000000)
scores = cross_val_score(model, features, target, cv=10)
boxplot(scores)
scores.mean()


Out[247]:
-0.1416965780356704

In [248]:
# Try classification
target = target.apply(lambda x: 0 if x < 7200 else 1) #0 if short, 1 if long

# Classify using extra tree classifiers, more random than random forest methods
clf = ExtraTreesClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0, bootstrap=True)
scores = cross_val_score(clf, features, target, cv=10)
    
# Display plot
boxplot(scores)
scores.mean()


Out[248]:
0.8352777777777778

Conclusion: Yes (29/05/18)


In [249]:
sum(target)


Out[249]:
61

In [250]:
len(target)


Out[250]:
89

Conclusion: Yes but very unbalanced classes (29/05/18)


In [ ]: